## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages
## package 'knitr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages
## package 'readr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages
## package 'dplyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages
## package 'broom' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages
## package 'forecast' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages
## package 'repr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages
## package 'lubridate' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages
## package 'janitor' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages
## package 'plotly' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Naveed Aamir\AppData\Local\Temp\RtmpmspurR\downloaded_packages

1 ASSESSMENT 2

1.1 INTRODUCTION

2 Section Naveed

2.1 Question 1

  • Do people with higher income consume more alcohol and how does the consumption of alcohol vary with an increase in income?
# A large scipen penalty makes R prefer fixed notation over scientific
# notation when printing numbers (e.g. the small regression coefficients).
options(scipen = 999)


# Wrangle and get the data ready for the model: keep observations whose GDP
# per capita lies in [0, 70000], keep only the three columns used below, then
# drop every row that still contains a missing value.
alcohol2 <- alcohol %>%
  filter(GDP_per_capita >= 0, GDP_per_capita <= 70000) %>%
  select(Entity, GDP_per_capita, Total_alcohol_consumption_per_capita)

alcohol3 <- na.omit(alcohol2)

# Scatter plot of consumption per person vs income
sp <- ggplot(
  data = alcohol3,
  aes(x = GDP_per_capita, y = Total_alcohol_consumption_per_capita)
) +
  geom_point() +
  labs(
    title = "Pure Alcohol Consumption per person with Income",
    x = "GDP Per Capita (USD)",
    y = "Pure Alcohol Consumption per Capita (Litres)",
    subtitle = "Consumption of alcohol per person increases with Income and eventually becomes constant",
    caption = "Alcohol Consumption Data 2000-2020"
  )

# Interactive version of the scatter plot. The title/subtitle and the caption
# are supplied again through layout() (presumably because ggplotly() does not
# carry the ggplot subtitle and caption over -- confirm against the plotly docs).
# Axis titles must be set through xaxis/yaxis: layout() has no top-level
# `x`/`y` attributes, and the font attribute is spelled `color`, not `colour`.
spi <- ggplotly(sp) %>%
  layout(
    title = list(
      text = paste0(
        "Variation in Pure Alcohol Consumption per person with Income",
        "<br>",
        "<sup>",
        "Consumption of alcohol per person increases with Income and eventually becomes constant",
        "</sup>"
      )
    ),
    xaxis = list(title = list(text = "Income per person (USD)")),
    yaxis = list(
      title = list(text = "Pure Alcohol Consumption per person (Litres)")
    ),
    # Caption placed in paper coordinates: right edge, just below the plot area
    annotations = list(
      x = 1, y = -0.1,
      text = "Alcohol Consumption Data 2000-2020",
      showarrow = FALSE,
      xref = "paper", yref = "paper",
      xanchor = "right", yanchor = "auto",
      xshift = 0, yshift = 0,
      font = list(size = 10, color = "red")
    )
  )
spi

Figure 2.1: plot1

# *****
# Fit a simple linear regression model of consumption per person on income
con_fit <- lm(
  Total_alcohol_consumption_per_capita ~ GDP_per_capita,
  data = alcohol3
)
# tidy(con_fit)
# Add predictions, residuals, etc. to the training data
con_model <- augment(con_fit, alcohol3)

# Prefer fixed over scientific notation; set before the plots are built so it
# is already in effect when they are constructed and printed.
options(scipen = 999)

# Overlay the fitted regression line (blue) on the observed data
con_mod <- ggplot(
  con_model,
  aes(x = GDP_per_capita, y = Total_alcohol_consumption_per_capita)
) +
  geom_point() +
  labs(
    title = "Linear Model of Pure Alcohol Consumption per person on Income",
    x = "GDP Per Capita (USD)",
    y = "Pure Alcohol Consumption per Capita (Litres)",
    subtitle = "The relationship between Consumption per person and Income is not linear",
    caption = "Alcohol Consumption Data 2000-2020"
  ) +
  geom_line(aes(y = .fitted), colour = "blue")

# Interactive version. Axis titles must be set through xaxis/yaxis: layout()
# has no top-level `x`/`y` attributes, and the annotation font attribute is
# spelled `color`, not `colour`.
con_modi <- ggplotly(con_mod) %>%
  layout(
    title = list(
      text = paste0(
        "Linear Model of Pure Alcohol Consumption per person on Income",
        "<br>",
        "<sup>",
        "The relationship between Consumption per person and Income is not linear",
        "</sup>"
      )
    ),
    xaxis = list(title = list(text = "GDP Per Capita (USD)")),
    yaxis = list(
      title = list(text = "Pure Alcohol Consumption per person (Litres)")
    ),
    # Caption placed in paper coordinates: right edge, just below the plot area
    annotations = list(
      x = 1, y = -0.1,
      text = "Alcohol Consumption Data 2000-2020",
      showarrow = FALSE,
      xref = "paper", yref = "paper",
      xanchor = "right", yanchor = "auto",
      xshift = 0, yshift = 0,
      font = list(size = 10, color = "red")
    )
  )

con_modi

Figure 2.2: plot2

# *****
# Summary of the fitted linear regression model
# Regress alcohol consumption per capita on GDP per capita using the cleaned
# data in alcohol3. This is the same formula and data as con_fit above; the
# call is written out with `formula =` so that it is echoed in full under
# "Call:" in the printed summary.
con_lin_fit <- lm(formula = Total_alcohol_consumption_per_capita ~ GDP_per_capita, data = alcohol3)

# Return a summary of the fitted linear regression model
# (residual quantiles, coefficient table, R-squared and overall F-test)
summary(con_lin_fit)
## 
## Call:
## lm(formula = Total_alcohol_consumption_per_capita ~ GDP_per_capita, 
##     data = alcohol3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9376  -2.4253   0.0752   2.3196  13.2824 
## 
## Coefficients:
##                   Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept)    4.322541227 0.226113925   19.12 <0.0000000000000002 ***
## GDP_per_capita 0.000105162 0.000009377   11.21 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.683 on 568 degrees of freedom
## Multiple R-squared:  0.1813, Adjusted R-squared:  0.1798 
## F-statistic: 125.8 on 1 and 568 DF,  p-value: < 0.00000000000000022
# *****
# Examine outliers: order every observation from the highest to the lowest
# alcohol consumption per capita
alc_outliers <- alcohol3 %>%
  arrange(desc(Total_alcohol_consumption_per_capita))
# *****
# Table of ten hand-picked countries: five among the highest and five among
# the lowest GDP per capita. Rows are ordered from richest to poorest first,
# then restricted to the chosen countries.
alcohol10 <- alcohol3 %>%
  arrange(desc(GDP_per_capita))

alcohol_gdp <- alcohol10 %>%
  filter(
    Entity %in% c(
      "Switzerland",
      "Brunei",
      "United Arab Emirates",
      "Norway",
      "United States",
      "Burundi",
      "Central African Republic",
      "Democratic Republic of Congo",
      "Somalia",
      "Mozambique"
    )
  )

knitr::kable(
  alcohol_gdp,
  caption = "Some countries with one of the highest and lowest GDP Per Capita and their corresponding alcohol consummption per capita"
)
Table 2.1: Some countries with one of the highest and lowest GDP Per Capita and their corresponding alcohol consummption per capita
Entity GDP_per_capita Total_alcohol_consumption_per_capita
Switzerland 68025.9219 11.580
Brunei 67753.4219 0.510
United Arab Emirates 66968.2188 3.900
Switzerland 65819.7344 11.410
United Arab Emirates 65267.4141 3.790
Norway 64341.2578 7.410
Brunei 63147.4844 0.460
Norway 62987.5156 7.520
United States 61585.7578 9.870
Norway 61353.4766 8.940
Brunei 60389.1836 0.480
United States 58540.2969 9.820
United Arab Emirates 54921.7969 3.150
United States 54315.9141 9.360
Mozambique 1289.7360 2.300
Mozambique 1262.6132 2.190
Central African Republic 1201.3522 3.980
Somalia 1129.5315 0.009
Democratic Republic of Congo 1085.8937 2.000
Democratic Republic of Congo 1065.2424 2.050
Mozambique 1027.2089 1.630
Somalia 1026.0382 0.013
Central African Republic 933.1099 2.380
Democratic Republic of Congo 865.6840 1.750
Central African Republic 852.7492 2.430
Burundi 846.1691 7.660
Burundi 825.2057 7.030
Burundi 761.5241 7.190
# *****
  • The scatter plot in (Figure 2.1) does show that, in general, consumption of alcohol is found to be higher for people with higher income.

  • At first, the consumption of alcohol increases exponentially as income increases and then becomes approximately constant around and beyond a GDP Per Capita of USD 20000.

  • The summary table of the linear regression model of consumption per person on income shows that, for an increase of USD 1000 in income, the consumption of alcohol per person increases by approximately 105.2 ml.

  • The overall p-value and the p-value for the variable GDP Per Capita are both much lower than the usual significance level of 5%, which shows that alcohol consumption per person is dependent on income.

  • However, the linear regression model only explains about 18.13% of the variation in alcohol consumption per person, which suggests that the relationship between alcohol consumption per person and income may not be linear and the model is not very accurate.

  • This is confirmed by the plot of the fitted linear regression model in (Figure 2.2), which also shows that most of the observations lie above the line of best fit and suggests a curved relationship instead.

  • Therefore, the relationship is not linear and the model can be optimized further.

  • With a GDP Per Capita of approximately USD 27500, Seychelles is an outlier (Ritchie et al. [2018]) and has the highest alcohol consumption per person of 20.5 L; further research may confirm if this is because Seychelles is a very popular tourist and retirement spot.

  • With a GDP Per Capita of approximately USD 27600, Malaysia is also an outlier (Ritchie et al. [2018]) and has one of the lowest alcohol consumption per person of 0.85 L; further research may confirm if this is because of religious influence, lack of preference and a lack of affordability in Malaysia.

  • The table in 2.1 shows countries with various GDP Per Capita and reiterates the fact that countries with higher income such as Switzerland, Norway and the United States have a higher alcohol consumption per person compared to countries with lower income such as Congo, Somalia and Mozambique. This table also confirms the earlier analysis that after a particular income level the alcohol consumption per person is relatively constant.

  • However, further research into the outlier Burundi with a GDP Per Capita of approximately only USD 800 but a nearly 400% higher alcohol consumption per person compared to the other lower income countries mentioned above, may suggest it to be the result of cultural influence.

  • Similarly, both the outliers UAE and Brunei have a comparatively low alcohol consumption per person compared to the other high-income countries mentioned above. Further research may help to determine if this is a result of religious influence in the respective countries.

2.2 Question 2

  • Is death dependent on consumption, income and alcohol household expense? Which countries had the highest percentage of deaths from alcohol use disorders in 2015? Are there any interesting findings and any possible potential underlying reasons?
# Select variables for regression by name rather than by column position, so
# the selection does not silently change if the columns of `alcohol` are
# reordered. The response comes first, followed by the three predictors.
selected.var <- c(
  "Death_alcohol_use_disorders",
  "GDP_per_capita",
  "Alcohol_expenditure_rate_per_household",
  "Total_alcohol_consumption_per_capita"
)
# Partition data: 2000 randomly chosen rows for training, the rest for testing
set.seed(1)  # set seed for reproducing the partition
# Sample from the actual number of rows instead of a hard-coded row count
train.index <- sample(seq_len(nrow(alcohol)), 2000)
train.df <- alcohol[train.index, selected.var]
test.df <- alcohol[-train.index, selected.var]

# Death regressed on consumption, income and alcohol household expense.
# NOTE: lm() drops rows with missing values; in the rendered output 1978 of
# the 2000 training rows were deleted due to missingness, so the model is
# estimated from only 22 complete observations.
death_m <- lm(Death_alcohol_use_disorders ~ ., data = train.df)
options(scipen = 999)  # prefer fixed over scientific notation when printing
summary(death_m)
## 
## Call:
## lm(formula = Death_alcohol_use_disorders ~ ., data = train.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7650 -2.3255 -0.5326  1.9368  6.2142 
## 
## Coefficients:
##                                           Estimate  Std. Error t value Pr(>|t|)
## (Intercept)                            -3.32124062  2.44233274  -1.360 0.190664
## GDP_per_capita                         -0.00015013  0.00005195  -2.890 0.009751
## Alcohol_expenditure_rate_per_household  0.49992440  1.11396453   0.449 0.658947
## Total_alcohol_consumption_per_capita    1.17868767  0.28556502   4.128 0.000632
##                                           
## (Intercept)                               
## GDP_per_capita                         ** 
## Alcohol_expenditure_rate_per_household    
## Total_alcohol_consumption_per_capita   ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.787 on 18 degrees of freedom
##   (1978 observations deleted due to missingness)
## Multiple R-squared:  0.5665, Adjusted R-squared:  0.4942 
## F-statistic: 7.841 on 3 and 18 DF,  p-value: 0.001487
# *****
# % death in alcohol use disorders vs alcohol consumption per capita
sp_death_con <- ggplot(
  data = alcohol,
  aes(x = Total_alcohol_consumption_per_capita, y = Death_alcohol_use_disorders)
) +
  geom_point() +
  labs(
    title = "Variation in death in alcohol use disorders",
    x = "Pure Alcohol Consumption per Capita (Litres)",
    y = "Death in alcohol use disorders(%)",
    subtitle = "Percentage death increases with an increase in alcohol consumption per person",
    caption = "Alcohol Consumption Data 2000-2020"
  )

# Interactive version. Axis titles must be set through xaxis/yaxis: layout()
# has no top-level `x`/`y` attributes, and the annotation font attribute is
# spelled `color`, not `colour`.
sp_death_coni <- ggplotly(sp_death_con) %>%
  layout(
    title = list(
      text = paste0(
        "Variation in death in alcohol use disorders",
        "<br>",
        "<sup>",
        "Percentage death increases with an increase in alcohol consumption per person",
        "</sup>"
      )
    ),
    xaxis = list(
      title = list(text = "Pure Alcohol Consumption per Capita (Litres)")
    ),
    yaxis = list(title = list(text = "Death in alcohol use disorders(%)")),
    # Caption placed in paper coordinates: right edge, just below the plot area
    annotations = list(
      x = 1, y = -0.1,
      text = "Alcohol Consumption Data 2000-2020",
      showarrow = FALSE,
      xref = "paper", yref = "paper",
      xanchor = "right", yanchor = "auto",
      xshift = 0, yshift = 0,
      font = list(size = 10, color = "red")
    )
  )


sp_death_coni

Figure 2.3: plotA

# % death in alcohol use disorders vs GDP Per Capita
sp_death_gdp <- ggplot(
  data = alcohol,
  aes(x = GDP_per_capita, y = Death_alcohol_use_disorders)
) +
  geom_point() +
  labs(
    title = "Variation in death in alcohol use disorders with increase in income",
    x = "GDP Per Capita (USD)",
    y = "Death in alcohol use disorders(%)",
    subtitle = "Percentage death decreases with an increase in GDP Per Capita",
    caption = "Alcohol Consumption Data 2000-2020"
  )

# Interactive version. Axis titles must be set through xaxis/yaxis: layout()
# has no top-level `x`/`y` attributes, and the annotation font attribute is
# spelled `color`, not `colour`.
sp_death_gdpi <- ggplotly(sp_death_gdp) %>%
  layout(
    title = list(
      text = paste0(
        "Variation in death in alcohol use disorders with increase in income",
        "<br>",
        "<sup>",
        "Percentage death decreases with an increase in GDP Per Capita",
        "</sup>"
      )
    ),
    xaxis = list(title = list(text = "GDP Per Capita (USD)")),
    yaxis = list(title = list(text = "Death in alcohol use disorders(%)")),
    # Caption placed in paper coordinates: right edge, just below the plot area
    annotations = list(
      x = 1, y = -0.1,
      text = "Alcohol Consumption Data 2000-2020",
      showarrow = FALSE,
      xref = "paper", yref = "paper",
      xanchor = "right", yanchor = "auto",
      xshift = 0, yshift = 0,
      font = list(size = 10, color = "red")
    )
  )


sp_death_gdpi

Figure 2.4: plotB

# *****
# % death in alcohol use disorders vs household alcohol expense
sp_death_exp <- ggplot(
  data = alcohol,
  aes(x = Alcohol_expenditure_rate_per_household, y = Death_alcohol_use_disorders)
) +
  geom_point() +
  labs(
    title = "Variation in death in alcohol use disorders with increase in household alcohol expense",
    x = "Household Alcohol Expense (%)",
    y = "Death in alcohol use disorders(%)",
    subtitle = "Percentage death increases with an increase in household alcohol expense",
    caption = "Alcohol Consumption Data 2000-2020"
  )

# Interactive version. Axis titles must be set through xaxis/yaxis: layout()
# has no top-level `x`/`y` attributes, and the annotation font attribute is
# spelled `color`, not `colour`.
sp_death_expi <- ggplotly(sp_death_exp) %>%
  layout(
    title = list(
      text = paste0(
        "Variation in death in alcohol use disorders with increase in household alcohol expense",
        "<br>",
        "<sup>",
        "Percentage death increases with an increase in household alcohol expense",
        "</sup>"
      )
    ),
    xaxis = list(title = list(text = "Household Alcohol Expense (%)")),
    yaxis = list(title = list(text = "Death in alcohol use disorders(%)")),
    # Caption placed in paper coordinates: right edge, just below the plot area
    annotations = list(
      x = 1, y = -0.1,
      text = "Alcohol Consumption Data 2000-2020",
      showarrow = FALSE,
      xref = "paper", yref = "paper",
      xanchor = "right", yanchor = "auto",
      xshift = 0, yshift = 0,
      font = list(size = 10, color = "red")
    )
  )


sp_death_expi

Figure 2.5: plotC

# *****
# Observations for the year 2015, reduced to the country name, the death
# percentage and the three explanatory variables
countries_2015 <- alcohol %>%
  filter(Year == 2015) %>%
  select(
    Entity,
    Death_alcohol_use_disorders,
    Total_alcohol_consumption_per_capita,
    GDP_per_capita,
    Alcohol_expenditure_rate_per_household
  )

# Countries with the highest percentage of alcohol related deaths in 2015.
# NOTE(review): the "top 7" is obtained through a fixed 12.5-22 band on the
# death percentage rather than by ranking -- confirm the band still yields
# exactly the top seven if the data change.
countries_deaths <- countries_2015 %>%
  filter(
    Death_alcohol_use_disorders >= 12.5 & Death_alcohol_use_disorders <= 22
  )

# Order the selection from the highest to the lowest death percentage
countries_deaths_highest <- countries_deaths %>%
  arrange(desc(Death_alcohol_use_disorders))

knitr::kable(
  countries_deaths_highest,
  caption = "The top 7 countries with the highest percentage of deaths in alcohol use disorders in 2015"
)
Table 2.2: The top 7 countries with the highest percentage of deaths in alcohol use disorders in 2015
Entity Death_alcohol_use_disorders Total_alcohol_consumption_per_capita GDP_per_capita Alcohol_expenditure_rate_per_household
Belarus 21.86 12.00 18362.746 NA
Mongolia 16.48 10.97 11009.715 NA
El Salvador 15.01 3.59 8143.326 NA
Russia 14.20 12.47 25488.096 1.8
Greenland 13.18 NA NA NA
Saint Kitts and Nevis 13.13 8.88 23966.498 NA
Guatemala 12.58 2.55 8125.656 NA
# *****
  • The Multiple Linear Regression model of percentage death (in alcohol use disorders) regressed on consumption, income and household alcohol expense has been used to investigate Question 2.

  • The model summary table shows that a 1 Litre increase in annual pure alcohol consumption per person increases the death percentage in alcohol use disorder by 1.179%.

  • However, an increase of USD 10000 in GDP Per Capita decreases the percentage death in alcohol use disorder by 1.5% and an increase of 1% in the weekly household alcohol expense increases the percentage death in alcohol use disorder by 0.5%, although this last coefficient is not statistically significant (its p-value is about 0.66).

  • The overall p-value is lower than the usual significance value of 5%, which suggests that percentage death in alcohol use disorders is dependant on the predictors present in the model.

  • The Multiple R-Squared value in the model summary table shows that the model explains 56.65% of the variation in percentage deaths in alcohol use disorders. Therefore, the model may be considered to be accurate, however, there is space for improvement.

  • The scatter plot in (Figure 2.3) of deaths from alcohol use disorders vs alcohol consumption per person shows that at first the percentage death increases slightly with an increase in consumption but then it increases exponentially beyond approximately 7.5 litres.

  • The scatter plot in (Figure 2.4) of deaths from alcohol use disorders vs GDP Per Capita shows that the percentage death is higher for countries with low GDP Per Capita and then the percentage death decreases exponentially as GDP Per Capita increases till it eventually almost becomes constant.

  • Further research may determine whether this decline in percentage deaths may be a result of the availability of safer and better quality alcohol for consumption, better individual health, quality of healthcare and affordability of better healthcare. These may also be some of the additional predictors that may help explain the remaining 43% of variation in deaths from alcohol use disorders.

  • The scatter plot in (Figure 2.5) of deaths from alcohol use disorders vs household alcohol expense shows that the percentage death is about constant at first but then increases exponentially beyond a household alcohol expense of 1.5%. This is contrary to the earlier description of the relationship between the same two variables by the Multiple Linear Regression model. Some of the observed outliers may be a result of the additional predictors mentioned above. This scatter plot is also reflective of the deaths from alcohol use disorders vs alcohol consumption per person in (Figure 2.3) and, therefore, reaffirms that percentage death is dependent on the alcohol consumption per person.

  • The above scatter plots also help explain why the Multiple Linear Regression model used explains only approximately 57% of the variation in percentage deaths in alcohol use disorders: the relation between the dependent and independent variables does not appear to be linear. Therefore, the model should be optimized further.

  • The table in 2.2 shows two of the outliers, El Salvador and Guatemala, both of which had one of the highest percentage deaths in alcohol use disorders in 2015 but one of the lowest alcohol consumption per capita of 3.59 litres and 2.55 litres respectively. Further research may help determine whether some of the additional predictors mentioned above such as consumption of unsafe and low quality alcohol, comparatively poorer individual health, lack of quality and affordable healthcare contributed to the high percentage of deaths in alcohol use disorders for these countries.

3 Section Jane

3.1 Question 1

3.2 Question 2

4 Section Tony

4.1 Question 1

4.2 Question 2

5 Section Gloria

5.1 Question 1

5.2 Question 2

6 Conclusion

7 References

8 Appendix